#############################################################
## Replication file for Ban, Palmer and Schneer 2019       ##
## file: format_rd.R                                       ##
## date: 5/16/2019                                         ##
#############################################################

# Start from a clean workspace: remove every object that ls() lists in the
# current environment (names starting with a dot are not listed by ls()).
rm(list=ls())

#install relevant packages
#install.packages(c('foreign','data.table','haven','stringdist'))

#load packages
require(foreign)
require(data.table)
require(haven)
require(stringdist)

#load relevant functions
# splitit: split every element of x on the pattern `splitchar` and return the
# n-th piece of each element.
#   x         - vector; coerced to character before splitting
#   splitchar - pattern passed to strsplit()
#   n         - position of the piece to extract
# Returns a character vector with one entry per element of x. vapply() is used
# instead of sapply() so the result is always character, including
# character(0) for zero-length input. An element with fewer than n pieces
# raises a "subscript out of bounds" error.
splitit <- function(x, splitchar, n) {
  pieces <- strsplit(as.character(x), splitchar)
  vapply(pieces, function(piece) piece[[n]], character(1))
}

#set working directory
setwd("/nfs/home/B/bschneer/shared_space/bschneer/lobbying_replication")

#load candidate and lobbyist data
# (the code below uses `candidates` as a list of tables and
# `lobbyist.final` as a list with an element named 'all')
load("usr_gen/candidates.RData")
load("usr_gen/lobbyist_final.RData")

#compile list files into one data set
# columns missing from some of the tables are padded with NA
candidates <- rbindlist(candidates, fill = TRUE)

#calculate vote margin
# candidate vote share measured from the 50% threshold
candidates[, vote_margin := vote_share_cand - 0.5]

#identify races w/ two members of same party
# a missing same-party flag is recorded as 0
for (flag_var in c("two_dems", "two_repubs")) {
  candidates[is.na(get(flag_var)), (flag_var) := 0]
}

# work on a copy so the loaded lobbyist table is not modified by reference
lobbyists <- copy(lobbyist.final[["all"]])

# lobbyist columns that are dropped before the merge
unused_cols <- c(
  "house", "sen", "period", "period_label", "three_component_index",
  "news_epu", "news_epu_12ma", "n_years", "staffer", "congress_current",
  "congress_com", "congress_expol", "congress_pol_98", "is_firm"
)
lobbyists[, (unused_cols) := NULL]

# keep every candidate row and attach the lobbyist columns by lobbyist_id
candidates <- lobbyists[candidates, on = "lobbyist_id"]

#set NAs to zeros where relevant
zero_fill_vars <- c(
  "lobbying_reports", "fe", "fe_lim",
  "amount", "amount_per_lobbyist", "amount_weighted_lim",
  "amount_cy", "amount_per_lobbyist_cy",
  "amount_yrly", "amount_weighted_lim_yrly", "amount_per_lobbyist_yrly"
)

for (zero_var in zero_fill_vars) {
  candidates[is.na(get(zero_var)), (zero_var) := 0]
}

# collapse party labels: anything containing "D" becomes "D", then anything
# containing "R" becomes "R" (a label containing both letters ends up "R")
candidates[grepl("D", party), party := "D"]
candidates[grepl("R", party), party := "R"]



#Candidate characteristics

#import candidate characteristics data for RD
# Legislator roster: an upper-case "LAST, FIRST MIDDLE" key next to the name
# parts and the recorded gender.
legs <- fread("inputs/congress_legislators.csv")[
  , .(cand_name = toupper(paste0(last, ", ", first, " ", middle)),
      last, first, middle, gender)]

# One row per distinct (upper-case first name, gender) pair.
legs.names0 <- unique(legs[, .(first = toupper(first), gender)])

# A first name that still occurs more than once is recorded with more than one
# gender value, so the first name alone cannot decide the gender.
ambiguous <- legs.names0[, duplicated(first) | duplicated(first, fromLast = TRUE)]
legs.names.dups <- legs.names0[ambiguous]
legs.names <- legs.names0[!ambiguous]

# Candidate first name: second comma-separated piece of cand_name, trimmed, cut
# at the first space (or at 100 characters when there is no space).
# splitit() is vectorised, so the whole column is handled in one call.
cname <- trimws(splitit(candidates$cand_name, ",", 2))
space_pos <- regexpr(" ", cname)
candfirst <- substr(cname, 1, ifelse(space_pos == -1, 100, space_pos - 1))

candidates[, first := candfirst]

# Attach gender to candidates whose first name is unambiguous.
candidates <- legs.names[candidates, on = "first"]

#identify duplicates
# Legislators carrying an ambiguous first name, with their full-name key.
dups <- legs[toupper(first) %in% unique(legs.names.dups$first), .(cand_name, gender)]

#fuzzy string match
# Distance between every candidate name (rows) and every ambiguous legislator
# name (columns); method 'jw' with p = 0 applies no prefix adjustment.
match.mx <- stringdistmatrix(candidates$cand_name, dups$cand_name, method = "jw", p = 0)

# For each legislator (column) take the closest candidate row. vapply() keeps
# the result an integer vector even if a column has no usable distance.
n_dups <- ncol(match.mx)
best_row <- vapply(seq_len(n_dups), function(k) {
  hit <- which.min(match.mx[, k])
  if (length(hit) == 0L) NA_integer_ else as.integer(hit)
}, integer(1))

# Distance at the chosen row (NA when no row could be chosen).
best_score <- match.mx[cbind(best_row, seq_len(n_dups))]

match.mx2 <- data.table(row = best_row, score = best_score)
# Distances above 0.1 are not treated as a match.
match.mx2[is.na(score) | score > .1, row := NA]

# Copy the matched legislator's gender into the matched candidate row, only
# where gender is still missing. set() writes into `candidates` itself;
# chaining `:=` after candidates[row] would only change a temporary copy.
# NOTE(review): only the single closest candidate row per legislator is
# filled; other rows with the same name are left to the later steps - confirm
# this is intended.
for (k in which(!is.na(match.mx2$row))) {
  hit_row <- match.mx2$row[k]
  if (is.na(candidates[["gender"]][hit_row])) {
    set(candidates, i = hit_row, j = "gender", value = dups$gender[k])
  }
}

#for unmatched data, use social security admin name probabilities
load("inputs/ssa_national.rda")
setDT(ssa_national)

# Visit each candidate row whose gender is still missing; the row numbers are
# fixed before the loop starts.
for (item in which(is.na(candidates[["gender"]]))) {

  # lower-case first name and the latest SSA year considered (year - 30)
  first_lower <- tolower(candidates[["first"]][item])
  cutoff_year <- as.numeric(candidates[["year"]][item] - 30)

  ssa_rows <- ssa_national[name == first_lower & year <= cutoff_year]

  if (nrow(ssa_rows) == 0) {
    # no SSA rows for this name/year window: report the row number and
    # leave gender missing
    print(item)
    next
  }

  # "M" when more than half of the pooled counts are male, otherwise "F"
  male_majority <- ssa_rows[, as.logical(sum(male) / sum(male, female) > .5)]
  gender_guess <- ifelse(male_majority, "M", "F")

  candidates[item, gender := gender_guess]
}

#write.csv(candidates[is.na(gender),cand_name],file='missing_cand_names.csv',row.names=F)

#Import corrected missing candidate names
# the lookup's gender column is renamed so it does not clash with the gender
# column already present in `candidates`
manual_fix <- fread("inputs/missing_cand_names_completed.csv")
setnames(manual_fix, old = "gender", new = "gender.update")

# every candidate row is kept; lookup columns are attached by cand_name
# (rows would repeat if the lookup lists a cand_name more than once)
candidates <- manual_fix[candidates, on = "cand_name"]

# take the manual value only where gender is still missing, then drop the
# helper column
fill_rows <- candidates[, is.na(gender) & !is.na(gender.update)]
candidates[fill_rows, gender := gender.update]
candidates[, gender.update := NULL]

#Load state data
# only the join key and the census region are taken from the lookup file
region_lookup <- fread("inputs/state_lookup.csv")[, .(state, census_region)]

#merge states to candidates
# every candidate row is kept; census_region is attached by state
candidates <- region_lookup[candidates, on = "state"]

#output candidate data
write_dta(candidates, "usr_gen/candidates.dta")
